3T_Pandas로 배우는 SQL 시작하기 (3) - GROUP BY

"sakila" 데이터베이스에 있는, "rental" 테이블에서 => rental_df

월별 렌탈 수 "출력" (혹은 DataFrame으로 만들기)

2015-05 ___
2016-06 ___
단, 날짜는 오름차순으로 정렬되어 있어야 합니다.



In [1]:

    
import pymysql

db = pymysql.connect(
    "db.fastcamp.us",
    "root",             
    "dkstncks",
    "sakila",
    charset='utf8',
)
rental_df = pd.read_sql("SELECT * FROM rental;", db)



In [7]:

    
rental_df = rental_df[["rental_id", "rental_date"]]



In [8]:

    
rental_df["month"] = rental_df["rental_date"].apply(lambda x: str(x)[:7])
rental_df.head()









    Out[8]:






  
    
      
      rental_id
      rental_date
      month
    
  
  
    
      0
      1
      2005-05-24 22:53:30
      2005-05
    
    
      1
      2
      2005-05-24 22:54:33
      2005-05
    
    
      2
      3
      2005-05-24 23:03:39
      2005-05
    
    
      3
      4
      2005-05-24 23:04:41
      2005-05
    
    
      4
      5
      2005-05-24 23:05:21
      2005-05



In [9]:

    
month_groups = rental_df.groupby("month")



In [12]:

    
month_groups.get_group("2005-05")









    Out[12]:






  
    
      
      rental_id
      rental_date
      month
    
  
  
    
      0
      1
      2005-05-24 22:53:30
      2005-05
    
    
      1
      2
      2005-05-24 22:54:33
      2005-05
    
    
      2
      3
      2005-05-24 23:03:39
      2005-05
    
    
      3
      4
      2005-05-24 23:04:41
      2005-05
    
    
      4
      5
      2005-05-24 23:05:21
      2005-05
    
    
      5
      6
      2005-05-24 23:08:07
      2005-05
    
    
      6
      7
      2005-05-24 23:11:53
      2005-05
    
    
      7
      8
      2005-05-24 23:31:46
      2005-05
    
    
      8
      9
      2005-05-25 00:00:40
      2005-05
    
    
      9
      10
      2005-05-25 00:02:21
      2005-05
    
    
      10
      11
      2005-05-25 00:09:02
      2005-05
    
    
      11
      12
      2005-05-25 00:19:27
      2005-05
    
    
      12
      13
      2005-05-25 00:22:55
      2005-05
    
    
      13
      14
      2005-05-25 00:31:15
      2005-05
    
    
      14
      15
      2005-05-25 00:39:22
      2005-05
    
    
      15
      16
      2005-05-25 00:43:11
      2005-05
    
    
      16
      17
      2005-05-25 01:06:36
      2005-05
    
    
      17
      18
      2005-05-25 01:10:47
      2005-05
    
    
      18
      19
      2005-05-25 01:17:24
      2005-05
    
    
      19
      20
      2005-05-25 01:48:41
      2005-05
    
    
      20
      21
      2005-05-25 01:59:46
      2005-05
    
    
      21
      22
      2005-05-25 02:19:23
      2005-05
    
    
      22
      23
      2005-05-25 02:40:21
      2005-05
    
    
      23
      24
      2005-05-25 02:53:02
      2005-05
    
    
      24
      25
      2005-05-25 03:21:20
      2005-05
    
    
      25
      26
      2005-05-25 03:36:50
      2005-05
    
    
      26
      27
      2005-05-25 03:41:50
      2005-05
    
    
      27
      28
      2005-05-25 03:42:37
      2005-05
    
    
      28
      29
      2005-05-25 03:47:12
      2005-05
    
    
      29
      30
      2005-05-25 04:01:32
      2005-05
    
    
      ...
      ...
      ...
      ...
    
    
      1126
      1128
      2005-05-31 17:49:26
      2005-05
    
    
      1127
      1129
      2005-05-31 18:00:48
      2005-05
    
    
      1128
      1130
      2005-05-31 18:13:57
      2005-05
    
    
      1129
      1131
      2005-05-31 18:44:19
      2005-05
    
    
      1130
      1132
      2005-05-31 18:44:53
      2005-05
    
    
      1131
      1133
      2005-05-31 19:12:21
      2005-05
    
    
      1132
      1134
      2005-05-31 19:14:15
      2005-05
    
    
      1133
      1135
      2005-05-31 19:15:11
      2005-05
    
    
      1134
      1136
      2005-05-31 19:19:36
      2005-05
    
    
      1135
      1137
      2005-05-31 19:20:14
      2005-05
    
    
      1136
      1138
      2005-05-31 19:30:27
      2005-05
    
    
      1137
      1139
      2005-05-31 19:34:52
      2005-05
    
    
      1138
      1140
      2005-05-31 19:36:30
      2005-05
    
    
      1139
      1141
      2005-05-31 19:42:02
      2005-05
    
    
      1140
      1142
      2005-05-31 19:46:38
      2005-05
    
    
      1141
      1143
      2005-05-31 19:53:03
      2005-05
    
    
      1142
      1144
      2005-05-31 20:04:10
      2005-05
    
    
      1143
      1145
      2005-05-31 20:13:45
      2005-05
    
    
      1144
      1146
      2005-05-31 20:34:45
      2005-05
    
    
      1145
      1147
      2005-05-31 20:37:52
      2005-05
    
    
      1146
      1148
      2005-05-31 20:38:40
      2005-05
    
    
      1147
      1149
      2005-05-31 21:03:17
      2005-05
    
    
      1148
      1150
      2005-05-31 21:20:09
      2005-05
    
    
      1149
      1151
      2005-05-31 21:29:00
      2005-05
    
    
      1150
      1152
      2005-05-31 21:32:17
      2005-05
    
    
      1151
      1153
      2005-05-31 21:36:44
      2005-05
    
    
      1152
      1154
      2005-05-31 21:42:09
      2005-05
    
    
      1153
      1155
      2005-05-31 22:17:11
      2005-05
    
    
      1154
      1156
      2005-05-31 22:37:34
      2005-05
    
    
      1155
      1157
      2005-05-31 22:47:45
      2005-05
    
  

1156 rows × 3 columns



In [10]:

    
rental_df["month"].unique()









    Out[10]:





array(['2005-05', '2005-06', '2005-07', '2005-08', '2006-02'], dtype=object)



In [16]:

    
len(month_groups.get_group("2005-05"))









    Out[16]:





1156



In [13]:

    
pd.DataFrame([
        {
            "month": month,
            "rental count": len(month_groups.get_group(month)),
        }
        for month
        in rental_df["month"].unique()
    ])









    Out[13]:






  
    
      
      month
      rental count
    
  
  
    
      0
      2005-05
      1156
    
    
      1
      2005-06
      2311
    
    
      2
      2005-07
      6709
    
    
      3
      2005-08
      5686
    
    
      4
      2006-02
      182



In [18]:

    
rental_df.groupby("month")









    Out[18]:





<pandas.core.groupby.DataFrameGroupBy object at 0x000000000B8E4358>



In [17]:

    
rental_df.groupby("month").size()









    Out[17]:





month
2005-05    1156
2005-06    2311
2005-07    6709
2005-08    5686
2006-02     182
dtype: int64



In [19]:

    
rental_df.groupby("month").agg({"rental_id": np.size})



In [24]:

    
SQL_QUERY = """
    SELECT LEFT(rental_date, 7) "Month", COUNT(*) "Total Rental"
    FROM rental
    GROUP BY Month
    ;
"""

pd.read_sql(SQL_QUERY, db)









    Out[24]:






  
    
      
      Month
      Total Rental
    
  
  
    
      0
      2005-05
      1156
    
    
      1
      2005-06
      2311
    
    
      2
      2005-07
      6709
    
    
      3
      2005-08
      5686
    
    
      4
      2006-02
      182

payment

월별 전체 매출, 평균 매출, 결제 갯수(전체 매출 = 평균 매출 * 결제 갯수)
pandas



In [39]:

    
SQL_QUERY = """
    SELECT *
    FROM payment
    ;
"""
payment_df = pd.read_sql(SQL_QUERY, db)



In [40]:

    
payment_df = payment_df[["payment_date", "payment_id", "amount"]]



In [41]:

    
payment_df.head(1)









    Out[41]:






  
    
      
      payment_date
      payment_id
      amount
    
  
  
    
      0
      2005-05-25 11:30:37
      1
      2.99



In [42]:

    
payment_df["month"] = payment_df["payment_date"].apply(lambda x: str(x)[:7])



In [46]:

    
payment_df.groupby("month").agg({"payment_id": np.size, "amount": [np.sum, np.mean]})



In [47]:

    
# 마지막으로 sql



In [54]:

    
SQL_QUERY = """
    SELECT
        LEFT(payment_date, 7) Month,
        count(*) "Total Payment",
        SUM(amount) "Total Revenue",
        AVG(amount) "Average Payment"
    FROM payment
    GROUP BY Month
    ;
"""

pd.read_sql(SQL_QUERY, db)









    Out[54]:






  
    
      
      Month
      Total Payment
      Total Revenue
      Average Payment
    
  
  
    
      0
      2005-05
      1157
      4824.43
      4.169775
    
    
      1
      2005-06
      2312
      9631.88
      4.166038
    
    
      2
      2005-07
      6711
      28373.89
      4.227968
    
    
      3
      2005-08
      5687
      24072.13
      4.232835
    
    
      4
      2006-02
      182
      514.18
      2.825165

오늘 배운 것 정리

SELECT __ FROM;
WHERE -> Filtering ( 1. 조건, 2. IN, 3. LIKE % )
ORDER BY ASC/DESC
JOIN -> 1. SELECT FROM , WHERE _ ( 원리 ) / 2. JOIN __ ON
GROUP BY -> 특정 칼럼을 기준으로 그룹화 => 의미 있는 지표

해보고 싶으면 해 봐

결제 누적액이 많은 유저 상위 10명 ( customer, payment )
영화를 흥행시킨 ( 매출이 많이 발생한 ) 배우 상위 10명
rental, payment ...

	rental_id	rental_date	month
0	1	2005-05-24 22:53:30	2005-05
1	2	2005-05-24 22:54:33	2005-05
2	3	2005-05-24 23:03:39	2005-05
3	4	2005-05-24 23:04:41	2005-05
4	5	2005-05-24 23:05:21	2005-05

	payment_id	amount
	size	sum	mean
month
2005-05	1157	4824.43	4.169775
2005-06	2312	9631.88	4.166038
2005-07	6711	28373.89	4.227968
2005-08	5687	24072.13	4.232835
2006-02	182	514.18	2.825165